{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Solutions of autochecker for Chinese"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### 1. Construct a detector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Step 1: construct a dict used to detect misspelled Chinese phrases\n",
    "# key is the Chinese word, value is its frequency of appearance in the corpus\n",
    "# you can finish this step by collecting a corpus from the internet\n",
    "# or you can choose an easier way: load a dict already created by others"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def construct_dict( file_path ):\n",
    "    \"\"\"Load a word-frequency table from a whitespace-separated text file.\n",
    "\n",
    "    Every line is split on whitespace: the first field is the word, the\n",
    "    second field its frequency; further fields are ignored. Lines with\n",
    "    fewer than two fields (for example blank lines) are skipped.\n",
    "\n",
    "    Args:\n",
    "        file_path: path of the frequency file.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each word, as read from the file, to its frequency\n",
    "        as a number (int when the field is an integer, float otherwise).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a frequency field is not numeric.\n",
    "    \"\"\"\n",
    "    word_freq = {}\n",
    "    with open(file_path, \"r\") as f:\n",
    "        for line in f:\n",
    "            info = line.split()\n",
    "            # A blank or truncated line carries no (word, frequency) pair.\n",
    "            if len(info) < 2:\n",
    "                continue\n",
    "            # Store a number, not the raw text: ranking by frequency must\n",
    "            # not compare strings, where \"9\" sorts after \"10\".\n",
    "            try:\n",
    "                frequency = int(info[1])\n",
    "            except ValueError:\n",
    "                frequency = float(info[1])\n",
    "            word_freq[info[0]] = frequency\n",
    "\n",
    "    return word_freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Location of the word-frequency file, relative to the working directory.\n",
    "FILE_PATH = \"./token_freq_pos%40350k_jieba.txt\"\n",
    "\n",
    "# Global lookup table: `known` and `auto_correct` read it by this name.\n",
    "phrase_freq = construct_dict( FILE_PATH )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<type 'dict'>\n",
      "349045\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the container type and the number of loaded entries.\n",
    "print( type(phrase_freq) )\n",
    "print( len(phrase_freq) )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### 2. Construct an autocorrecter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pinyin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Inventory of Chinese characters used to generate edit candidates,\n",
    "# read from a UTF-8 text file.\n",
    "def load_cn_words_dict( file_path ):\n",
    "    \"\"\"Return the content of a UTF-8 text file as one unicode string.\n",
    "\n",
    "    Every line is stripped of surrounding whitespace and the lines are\n",
    "    concatenated in file order, so iterating over the result yields the\n",
    "    individual characters of the file.\n",
    "\n",
    "    Args:\n",
    "        file_path: path of the UTF-8 encoded file.\n",
    "\n",
    "    Returns:\n",
    "        unicode string made of all stripped lines (empty for an empty file).\n",
    "    \"\"\"\n",
    "    # Read bytes and decode explicitly so the result is unicode on every\n",
    "    # Python version; a single join avoids the quadratic cost of growing\n",
    "    # a string with += inside the loop.\n",
    "    with open(file_path, \"rb\") as f:\n",
    "        return u\"\".join(line.strip().decode(\"utf-8\") for line in f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# generate every phrase within edit distance 1 of a Chinese phrase\n",
    "def edits1(phrase, cn_words_dict):\n",
    "    \"\"\"Return all phrases that are one edit away from `phrase`.\n",
    "\n",
    "    An edit is the deletion of one character, the transposition of two\n",
    "    adjacent characters, the replacement of one character by a character\n",
    "    of `cn_words_dict`, or the insertion of such a character.\n",
    "\n",
    "    Args:\n",
    "        phrase: the phrase to edit, as UTF-8 bytes or as unicode text.\n",
    "        cn_words_dict: sequence of characters used for replacements and\n",
    "            insertions (it is iterated once per split position).\n",
    "\n",
    "    Returns:\n",
    "        set of unicode strings.\n",
    "    \"\"\"\n",
    "    # Edits must operate on characters, not on UTF-8 bytes. Text that is\n",
    "    # already unicode is used as is: decoding it again would fail.\n",
    "    if isinstance(phrase, bytes):\n",
    "        phrase = phrase.decode(\"utf-8\")\n",
    "    splits     = [(phrase[:i], phrase[i:])  for i in range(len(phrase) + 1)]\n",
    "    deletes    = [L + R[1:]                 for L, R in splits if R]\n",
    "    transposes = [L + R[1] + R[0] + R[2:]   for L, R in splits if len(R)>1]\n",
    "    replaces   = [L + c + R[1:]             for L, R in splits if R for c in cn_words_dict]\n",
    "    inserts    = [L + c + R                 for L, R in splits for c in cn_words_dict]\n",
    "    return set(deletes + transposes + replaces + inserts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# keep only the phrases that are present in phrase_freq\n",
    "def known(phrases):\n",
    "    \"\"\"Return the set of `phrases` whose UTF-8 encoding is a key of `phrase_freq`.\"\"\"\n",
    "    recognised = set()\n",
    "    for candidate in phrases:\n",
    "        if candidate.encode(\"utf-8\") in phrase_freq:\n",
    "            recognised.add(candidate)\n",
    "    return recognised"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Collect the correction candidates of a misspelled phrase and rank them by\n",
    "# how well their pinyin matches the pinyin of the misspelled phrase:\n",
    "#   1st order - the whole pinyin is identical\n",
    "#   2nd order - only the first pinyin segment (text before the first \"/\") is identical\n",
    "#   3rd order - every other candidate\n",
    "def get_candidates( error_phrase, cn_dict_path=\"./cn_dict.txt\" ):\n",
    "    \"\"\"Return the known one-edit candidates of `error_phrase` in three tiers.\n",
    "\n",
    "    Args:\n",
    "        error_phrase: the misspelled phrase; it is handed to `edits1` and\n",
    "            to `pinyin.get`.\n",
    "        cn_dict_path: path of the character list read by\n",
    "            `load_cn_words_dict`. Defaults to the path that used to be\n",
    "            hard-coded in this function.\n",
    "\n",
    "    Returns:\n",
    "        tuple of three lists: (1st order, 2nd order, 3rd order) candidates.\n",
    "    \"\"\"\n",
    "    # The character list does not change between calls: read each file once\n",
    "    # and keep it on the function object. Re-running this cell resets it.\n",
    "    cache = getattr(get_candidates, \"_cn_words_cache\", None)\n",
    "    if cache is None:\n",
    "        cache = get_candidates._cn_words_cache = {}\n",
    "    if cn_dict_path not in cache:\n",
    "        cache[cn_dict_path] = load_cn_words_dict( cn_dict_path )\n",
    "    cn_words_dict = cache[cn_dict_path]\n",
    "\n",
    "    candidates_1st_order = []\n",
    "    candidates_2nd_order = []\n",
    "    candidates_3rd_order = []\n",
    "\n",
    "    error_pinyin = pinyin.get(error_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n",
    "    # Loop-invariant: first pinyin segment of the misspelled phrase.\n",
    "    error_first_pinyin = error_pinyin.split(\"/\")[0]\n",
    "    candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) )\n",
    "\n",
    "    for candidate_phrase in candidate_phrases:\n",
    "        candidate_pinyin = pinyin.get(candidate_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n",
    "        if candidate_pinyin == error_pinyin:\n",
    "            candidates_1st_order.append(candidate_phrase)\n",
    "        elif candidate_pinyin.split(\"/\")[0] == error_first_pinyin:\n",
    "            candidates_2nd_order.append(candidate_phrase)\n",
    "        else:\n",
    "            candidates_3rd_order.append(candidate_phrase)\n",
    "\n",
    "    return candidates_1st_order, candidates_2nd_order, candidates_3rd_order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def auto_correct( error_phrase ):\n",
    "    \"\"\"Return the most plausible correction of a misspelled phrase.\n",
    "\n",
    "    The candidates come from `get_candidates`. The first non-empty pinyin\n",
    "    tier is used and, inside it, the candidate with the highest frequency\n",
    "    in `phrase_freq` is returned.\n",
    "\n",
    "    Args:\n",
    "        error_phrase: the misspelled phrase, as UTF-8 bytes or unicode text.\n",
    "\n",
    "    Returns:\n",
    "        the selected candidate, or `error_phrase` itself (as unicode text)\n",
    "        when no candidate is known.\n",
    "    \"\"\"\n",
    "    def frequency_of(candidate):\n",
    "        # phrase_freq is keyed by the native str type read from the file,\n",
    "        # while candidates may be unicode objects: encode those before the\n",
    "        # lookup, otherwise every lookup misses and the ranking is arbitrary.\n",
    "        key = candidate if isinstance(candidate, str) else candidate.encode(\"utf-8\")\n",
    "        # Compare numerically even when the table stores frequencies as text.\n",
    "        return float(phrase_freq.get(key, 0))\n",
    "\n",
    "    for tier in get_candidates(error_phrase):\n",
    "        if tier:\n",
    "            return max(tier, key=frequency_of)\n",
    "\n",
    "    # No known candidate at all: max() on an empty list would raise, so the\n",
    "    # phrase is handed back unchanged, as unicode like any other result.\n",
    "    if isinstance(error_phrase, bytes):\n",
    "        return error_phrase.decode(\"utf-8\")\n",
    "    return error_phrase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "呕涂 呕吐\n",
      "东方之朱 东方之珠\n",
      "沙拢 沙龙\n"
     ]
    }
   ],
   "source": [
    "# test for the auto_correct \n",
    "error_phrase_1 = \"呕涂\" # should be \"呕吐\"\n",
    "error_phrase_2 = \"东方之朱\" # should be \"东方之珠\"\n",
    "error_phrase_3 = \"沙拢\" # should be \"沙龙\"\n",
    "\n",
    "print error_phrase_1, auto_correct( error_phrase_1 )\n",
    "print error_phrase_2, auto_correct( error_phrase_2 )\n",
    "print error_phrase_3, auto_correct( error_phrase_3 )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### 3. Correct the misspelled phrases in a sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Step 3: Tokenization\n",
    "# For any given sentence, use jieba to do the segmentation\n",
    "# Get the segment list after the segmentation is done\n",
    "# Check whether each remaining phrase exists in the phrase_freq dict\n",
    "# If not, then it is a misspelled phrase\n",
    "# Use the auto_correct function to correct the phrase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "import string\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# ASCII punctuation followed by full-width / Chinese punctuation marks\n",
    "PUNCTUATION_LIST = string.punctuation + \"。，？：；｛｝［］‘“”《》／！％……（）\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def auto_correct_sentence( error_sentence, verbose=True):\n",
    "    \"\"\"Correct every misspelled phrase of a sentence.\n",
    "\n",
    "    The sentence is segmented with jieba. A segment that is neither\n",
    "    whitespace, nor found in `PUNCTUATION_LIST`, nor a key of `phrase_freq`\n",
    "    is treated as misspelled and replaced by the result of `auto_correct`.\n",
    "\n",
    "    Args:\n",
    "        error_sentence: the sentence to correct, as UTF-8 bytes or unicode.\n",
    "        verbose: when true, print every (misspelled, corrected) pair and\n",
    "            the corrected sentence.\n",
    "\n",
    "    Returns:\n",
    "        the corrected sentence as unicode text.\n",
    "    \"\"\"\n",
    "    # Segment the argument itself, never a variable of the notebook session.\n",
    "    if isinstance(error_sentence, bytes):\n",
    "        error_sentence = error_sentence.decode(\"utf-8\")\n",
    "    seg_list = list( jieba.cut(error_sentence, cut_all=False) )\n",
    "\n",
    "    # Decode the punctuation table once instead of once per segment.\n",
    "    punctuation = PUNCTUATION_LIST\n",
    "    if isinstance(punctuation, bytes):\n",
    "        punctuation = punctuation.decode(\"utf-8\")\n",
    "\n",
    "    corrected_parts = []\n",
    "\n",
    "    for phrase in seg_list:\n",
    "\n",
    "        correct_phrase = phrase\n",
    "        # Whitespace and punctuation are copied through unchanged.\n",
    "        skip = (not phrase.strip()) or (phrase in punctuation)\n",
    "        # A phrase missing from the dict is a misspelled phrase. Membership\n",
    "        # is tested on the dict itself, which is a hash lookup.\n",
    "        if not skip and phrase.encode(\"utf-8\") not in phrase_freq:\n",
    "            correct_phrase = auto_correct(phrase.encode(\"utf-8\"))\n",
    "            if verbose:\n",
    "                print(u\"%s %s\" % (phrase, correct_phrase))\n",
    "\n",
    "        corrected_parts.append(correct_phrase)\n",
    "\n",
    "    correct_sentence = u\"\".join(corrected_parts)\n",
    "\n",
    "    if verbose:\n",
    "        print(correct_sentence)\n",
    "    return correct_sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "机七 机器\n",
      "领遇 领域\n",
      "分知 分枝\n",
      "机器学习是人工智能领域最能体现智能的一个分枝！\n"
     ]
    }
   ],
   "source": [
    "# Demo: correct a whole sentence; with the default verbose=True the\n",
    "# replaced phrases and the corrected sentence are printed.\n",
    "err_sent = '机七学习是人工智能领遇最能体现智能的一个分知！'\n",
    "correct_sent = auto_correct_sentence( err_sent )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "机器学习是人工智能领域最能体现智能的一个分枝！\n"
     ]
    }
   ],
   "source": [
    "# show the corrected sentence returned by auto_correct_sentence\n",
    "print(correct_sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp_interview",
   "language": "python",
   "name": "nlp_interview"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
